import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Load the track dataset from a CSV file located in the working directory.
data = pd.read_csv("spotifyData.csv")
# Preview the first rows to get a feel for the columns.
data.head()
| Artist | Track | Album | Album_type | Danceability | Energy | Loudness | Speechiness | Acousticness | Instrumentalness | ... | Title | Channel | Views | Likes | Comments | Licensed | official_video | Stream | EnergyLiveness | most_playedon | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Gorillaz | Feel Good Inc. | Demon Days | album | 0.818 | 0.705 | -6.679 | 0.1770 | 0.008360 | 0.002330 | ... | Gorillaz - Feel Good Inc. (Official Video) | Gorillaz | 693555221.0 | 6220896.0 | 169907.0 | True | True | 1.040235e+09 | 1.150082 | Spotify |
| 1 | Gorillaz | Rhinestone Eyes | Plastic Beach | album | 0.676 | 0.703 | -5.815 | 0.0302 | 0.086900 | 0.000687 | ... | Gorillaz - Rhinestone Eyes [Storyboard Film] (... | Gorillaz | 72011645.0 | 1079128.0 | 31003.0 | True | True | 3.100837e+08 | 15.183585 | Spotify |
| 2 | Gorillaz | New Gold (feat. Tame Impala and Bootie Brown) | New Gold (feat. Tame Impala and Bootie Brown) | single | 0.695 | 0.923 | -3.930 | 0.0522 | 0.042500 | 0.046900 | ... | Gorillaz - New Gold ft. Tame Impala & Bootie B... | Gorillaz | 8435055.0 | 282142.0 | 7399.0 | True | True | 6.306347e+07 | 7.956897 | Spotify |
| 3 | Gorillaz | On Melancholy Hill | Plastic Beach | album | 0.689 | 0.739 | -5.810 | 0.0260 | 0.000015 | 0.509000 | ... | Gorillaz - On Melancholy Hill (Official Video) | Gorillaz | 211754952.0 | 1788577.0 | 55229.0 | True | True | 4.346636e+08 | 11.546875 | Spotify |
| 4 | Gorillaz | Clint Eastwood | Gorillaz | album | 0.663 | 0.694 | -8.627 | 0.1710 | 0.025300 | 0.000000 | ... | Gorillaz - Clint Eastwood (Official Video) | Gorillaz | 618480958.0 | 6197318.0 | 155930.0 | True | True | 6.172597e+08 | 9.942693 | Youtube |
5 rows × 24 columns
Our data consists of both qualitative and quantitative data.
# (number of rows, number of columns) of the loaded frame.
data.shape
(20594, 24)
The data has 20,594 rows and 24 columns.
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
data.describe()
| Danceability | Energy | Loudness | Speechiness | Acousticness | Instrumentalness | Liveness | Valence | Tempo | Duration_min | Views | Likes | Comments | Stream | EnergyLiveness | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 20594.000000 | 20594.000000 | 20594.000000 | 20594.000000 | 20594.000000 | 20594.000000 | 20594.000000 | 20594.000000 | 20594.000000 | 20594.000000 | 2.059400e+04 | 2.059400e+04 | 2.059400e+04 | 2.059400e+04 | 20592.000000 |
| mean | 0.620102 | 0.635176 | -7.678254 | 0.096733 | 0.291391 | 0.056162 | 0.193653 | 0.530077 | 120.562616 | 3.742439 | 9.203740e+07 | 6.479902e+05 | 2.684679e+04 | 1.326446e+08 | 5.167227 |
| std | 0.165504 | 0.214274 | 4.639481 | 0.112182 | 0.286117 | 0.193622 | 0.168832 | 0.245542 | 29.588093 | 2.085211 | 2.726026e+08 | 1.773648e+06 | 1.911751e+05 | 2.423582e+08 | 4.117431 |
| min | 0.000000 | 0.000000 | -46.251000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000000e+00 | 0.000049 |
| 25% | 0.519000 | 0.507000 | -8.868000 | 0.035700 | 0.045200 | 0.000000 | 0.094100 | 0.340000 | 96.994000 | 2.996746 | 1.478284e+06 | 1.754200e+04 | 4.060000e+02 | 1.559098e+07 | 2.386190 |
| 50% | 0.638000 | 0.666000 | -6.540500 | 0.050650 | 0.193000 | 0.000002 | 0.125000 | 0.538000 | 119.959000 | 3.551267 | 1.331348e+07 | 1.153155e+05 | 3.006000e+03 | 4.730525e+07 | 4.256881 |
| 75% | 0.741000 | 0.798000 | -4.935000 | 0.104000 | 0.476750 | 0.000474 | 0.237000 | 0.727000 | 139.923500 | 4.202163 | 6.739682e+07 | 5.000198e+05 | 1.373675e+04 | 1.343453e+08 | 6.822034 |
| max | 0.975000 | 1.000000 | 0.920000 | 0.964000 | 0.996000 | 1.000000 | 1.000000 | 0.993000 | 243.372000 | 77.934300 | 8.079649e+09 | 5.078865e+07 | 1.608314e+07 | 3.386520e+09 | 59.113924 |
The mean, maximum and minimum view counts per track were approximately 92,037,400, 8,079,649,000 and 0, respectively.
# Per-column non-null counts and dtypes, plus overall memory usage.
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 20594 entries, 0 to 20593 Data columns (total 24 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Artist 20594 non-null object 1 Track 20594 non-null object 2 Album 20594 non-null object 3 Album_type 20594 non-null object 4 Danceability 20594 non-null float64 5 Energy 20594 non-null float64 6 Loudness 20594 non-null float64 7 Speechiness 20594 non-null float64 8 Acousticness 20594 non-null float64 9 Instrumentalness 20594 non-null float64 10 Liveness 20594 non-null float64 11 Valence 20594 non-null float64 12 Tempo 20594 non-null float64 13 Duration_min 20594 non-null float64 14 Title 20594 non-null object 15 Channel 20594 non-null object 16 Views 20594 non-null float64 17 Likes 20594 non-null float64 18 Comments 20594 non-null float64 19 Licensed 20594 non-null object 20 official_video 20594 non-null object 21 Stream 20594 non-null float64 22 EnergyLiveness 20592 non-null float64 23 most_playedon 20594 non-null object dtypes: float64(15), object(9) memory usage: 3.8+ MB
Our data has both qualitative data (e.g. Artist, Track, Album) and quantitative data.
Checking if there is any null value in our dataset
# Count the null entries in each column.
data.isnull().sum()
Artist 0 Track 0 Album 0 Album_type 0 Danceability 0 Energy 0 Loudness 0 Speechiness 0 Acousticness 0 Instrumentalness 0 Liveness 0 Valence 0 Tempo 0 Duration_min 0 Title 0 Channel 0 Views 0 Likes 0 Comments 0 Licensed 0 official_video 0 Stream 0 EnergyLiveness 2 most_playedon 0 dtype: int64
Let's drop the rows that have null values in the EnergyLiveness column.
# Discard every row that contains at least one null value
# (the defaults of dropna, spelled out explicitly).
data = data.dropna(axis=0, how="any")

# Re-check: every per-column null count should now be zero.
data.isnull().sum()
Artist 0 Track 0 Album 0 Album_type 0 Danceability 0 Energy 0 Loudness 0 Speechiness 0 Acousticness 0 Instrumentalness 0 Liveness 0 Valence 0 Tempo 0 Duration_min 0 Title 0 Channel 0 Views 0 Likes 0 Comments 0 Licensed 0 official_video 0 Stream 0 EnergyLiveness 0 most_playedon 0 dtype: int64
# Placeholder strings that are to be treated as missing values.
missingValue = ["??", "na", "nan", "x", "?", ""]

# Swap the placeholders for real NaN so dropna() can remove those rows.
# `np.nan` is used because the `np.NaN` alias was removed in NumPy 2.0.
# NOTE(review): the replacement is applied to every column, so a text cell
# whose entire value is e.g. "x" or "na" is also treated as missing and its
# row is dropped - confirm this is intended.
data = data.replace(missingValue, np.nan)
data = data.dropna()

# Verify that no null entries remain.
data.isnull().sum()
Artist 0 Track 0 Album 0 Album_type 0 Danceability 0 Energy 0 Loudness 0 Speechiness 0 Acousticness 0 Instrumentalness 0 Liveness 0 Valence 0 Tempo 0 Duration_min 0 Title 0 Channel 0 Views 0 Likes 0 Comments 0 Licensed 0 official_video 0 Stream 0 EnergyLiveness 0 most_playedon 0 dtype: int64
# Correlation matrix of the audio features and engagement metrics,
# rendered as an annotated heatmap.
heatmap_columns = [
    "Danceability",
    "Energy",
    "Loudness",
    "Speechiness",
    "Acousticness",
    "Instrumentalness",
    "Liveness",
    "Valence",
    "Likes",
    "Views",
    "Stream",
    "Comments",
]
corr = data[heatmap_columns].corr()

sns.set(style="darkgrid")
plt.figure(figsize=(12, 8))
# Each cell is annotated as a whole-number percentage.
sns.heatmap(corr, annot=True, fmt=".0%")
<Axes: >
# Scatter Energy against Loudness and then against Acousticness,
# one figure per feature.
for feature in ("Loudness", "Acousticness"):
    sns.scatterplot(data=data, x="Energy", y=feature)
    plt.title(f"Correlation between Energy and {feature}")
    plt.show()
# Pairwise relationship grid over the audio features and engagement metrics.
pairplot_columns = [
    "Danceability",
    "Energy",
    "Loudness",
    "Speechiness",
    "Acousticness",
    "Instrumentalness",
    "Liveness",
    "Valence",
    "Likes",
    "Views",
    "Stream",
    "Comments",
]
distplot = data[pairplot_columns]
sns.pairplot(distplot)
<seaborn.axisgrid.PairGrid at 0x7fce6d0e7610>
# Share of tracks per value of most_playedon, drawn as a pie chart.
platform_counts = data["most_playedon"].value_counts()
plt.pie(
    platform_counts,
    labels=platform_counts.index,
    autopct="%.2f%%",
)
plt.title("Count of most_playedon for both Youtube and Spotify")
plt.show()
# Number of tracks per value of most_playedon, with the count printed on each bar.
ax = sns.countplot(data=data, x="most_playedon", palette="bright")
for container in ax.containers:
    ax.bar_label(container)
ax.set_title("Count of most_playedon for both Youtube and Spotify", fontsize=13)
plt.show()
# Number of tracks per value of most_playedon, split by the Licensed flag,
# with the count printed on each bar.
ax = sns.countplot(x="most_playedon", hue="Licensed", palette="bright", data=data)
for label in ax.containers:
    ax.bar_label(label)
# Title typo fixed: "licensed an not licensed" -> "licensed and not licensed".
plt.title("Count of licensed and not licensed tracks from each platform", fontsize=14)
plt.show()
# Per-artist means of Energy, Acousticness and Views; show the ten artists
# with the highest mean Views.
(
    data[["Artist", "Energy", "Acousticness", "Views"]]
    .groupby("Artist")
    .mean()
    .sort_values(by="Views", ascending=False)
    .head(10)
)
| Energy | Acousticness | Views | |
|---|---|---|---|
| Artist | |||
| Ed Sheeran | 0.6089 | 0.308300 | 1.546021e+09 |
| CoComelon | 0.4002 | 0.529600 | 1.460167e+09 |
| Katy Perry | 0.7655 | 0.041185 | 1.312063e+09 |
| Charlie Puth | 0.5650 | 0.381210 | 1.216759e+09 |
| Luis Fonsi | 0.7417 | 0.314690 | 1.162811e+09 |
| Justin Bieber | 0.6003 | 0.373010 | 1.099106e+09 |
| Daddy Yankee | 0.8212 | 0.133920 | 1.087193e+09 |
| Bruno Mars | 0.6256 | 0.197940 | 1.024092e+09 |
| Macklemore & Ryan Lewis | 0.6699 | 0.223420 | 1.012206e+09 |
| Coldplay | 0.5704 | 0.207006 | 9.997278e+08 |
# Ten artists with the highest mean Views, drawn as a labelled bar chart.
z = (
    data[["Artist", "Energy", "Acousticness", "Views"]]
    .groupby("Artist")
    .mean()
    .sort_values(by="Views", ascending=False)
    .head(10)
)

plt.figure(figsize=(20, 10))
ax = sns.barplot(data=z, x=z.index, y="Views")
for container in ax.containers:
    ax.bar_label(container)
ax.set_title("Top 10 Artist with respect to their views", fontsize=20)
plt.show()
# Per-artist means of Energy, Acousticness and Stream; show the ten artists
# with the highest mean Stream.
(
    data[["Artist", "Energy", "Acousticness", "Stream"]]
    .groupby("Artist")
    .mean()
    .sort_values(by="Stream", ascending=False)
    .head(10)
)
| Energy | Acousticness | Stream | |
|---|---|---|---|
| Artist | |||
| Post Malone | 0.6350 | 0.221320 | 1.525126e+09 |
| Ed Sheeran | 0.6089 | 0.308300 | 1.439488e+09 |
| Dua Lipa | 0.7347 | 0.045934 | 1.340808e+09 |
| The Weeknd | 0.6425 | 0.124516 | 1.303197e+09 |
| XXXTENTACION | 0.5075 | 0.398617 | 1.233362e+09 |
| Justin Bieber | 0.6003 | 0.373010 | 1.209777e+09 |
| Imagine Dragons | 0.7174 | 0.095532 | 1.185831e+09 |
| Coldplay | 0.5704 | 0.207006 | 1.177848e+09 |
| Khalid | 0.5937 | 0.291010 | 1.138684e+09 |
| Bruno Mars | 0.6256 | 0.197940 | 1.089786e+09 |
# Mean Energy, Valence, Stream and Views per (Track, Artist) pair, keeping the
# fifteen pairs with the highest mean Views.
top15 = (
    data[["Artist", "Track", "Energy", "Valence", "Stream", "Views"]]
    .groupby(["Track", "Artist"])
    .mean()
    .sort_values(by="Views", ascending=False)[:15]
)
# Display the stored result instead of recomputing the identical aggregation.
top15
| Energy | Valence | Stream | Views | ||
|---|---|---|---|---|---|
| Track | Artist | ||||
| Despacito | Luis Fonsi | 0.797 | 0.839 | 1.506598e+09 | 8.079649e+09 |
| Daddy Yankee | 0.797 | 0.839 | 1.506598e+09 | 8.079647e+09 | |
| Shape of You | Ed Sheeran | 0.652 | 0.931 | 3.362005e+09 | 5.908398e+09 |
| See You Again (feat. Charlie Puth) | Charlie Puth | 0.481 | 0.283 | 1.521255e+09 | 5.773798e+09 |
| Wiz Khalifa | 0.481 | 0.283 | 1.521255e+09 | 5.773797e+09 | |
| Wheels on the Bus | CoComelon | 0.387 | 0.965 | 8.343436e+07 | 4.898831e+09 |
| Uptown Funk (feat. Bruno Mars) | Mark Ronson | 0.609 | 0.928 | 1.653820e+09 | 4.821016e+09 |
| Gangnam Style (?????) | PSY | 0.937 | 0.749 | 3.709911e+08 | 4.679767e+09 |
| Sugar | Maroon 5 | 0.788 | 0.884 | 1.502781e+09 | 3.817733e+09 |
| Roar | Katy Perry | 0.771 | 0.436 | 8.847210e+08 | 3.725749e+09 |
| Counting Stars | OneRepublic | 0.705 | 0.477 | 1.805320e+09 | 3.721610e+09 |
| Sorry | Justin Bieber | 0.760 | 0.410 | 1.740759e+09 | 3.627306e+09 |
| Thinking out Loud | Ed Sheeran | 0.445 | 0.591 | 2.154334e+09 | 3.547156e+09 |
| Baa Baa Black Sheep | CoComelon | 0.396 | 0.969 | 4.655435e+07 | 3.486504e+09 |
| Waka Waka (This Time for Africa) [The Official 2010 FIFA World Cup (TM) Song] (feat. Freshlyground) | Shakira | 0.871 | 0.753 | 6.299185e+08 | 3.463816e+09 |
# Ten track titles with the highest mean Views, drawn as a labelled bar chart.
top10 = (
    data[["Track", "Energy", "Valence", "Stream", "Views"]]
    .groupby(["Track"])
    .mean()
    .sort_values(by="Views", ascending=False)
    .head(10)
)

plt.figure(figsize=(27, 20))
ax = sns.barplot(data=top10, x=top10.index, y="Views")
for container in ax.containers:
    ax.bar_label(container)
ax.set_title("Top 10 Tracks with respect to their views", fontsize=30)
plt.show()
# Take the fifteen artists with the highest mean Views, then order that
# subset from highest to lowest mean Valence.
topValenceTrack = (
    data[["Artist", "Energy", "Valence", "Stream", "Views"]]
    .groupby(["Artist"])
    .mean()
    .sort_values(by="Views", ascending=False)
    .head(15)
    .sort_values(by="Valence", ascending=False)
)
topValenceTrack
| Energy | Valence | Stream | Views | |
|---|---|---|---|---|
| Artist | ||||
| CoComelon | 0.4002 | 0.82380 | 3.690822e+07 | 1.460167e+09 |
| Daddy Yankee | 0.8212 | 0.68220 | 5.135462e+08 | 1.087193e+09 |
| Luis Fonsi | 0.7417 | 0.64480 | 5.054326e+08 | 1.162811e+09 |
| Calvin Harris | 0.8579 | 0.63190 | 9.548542e+08 | 9.758476e+08 |
| Charlie Puth | 0.5650 | 0.61090 | 7.041187e+08 | 1.216759e+09 |
| Katy Perry | 0.7655 | 0.60140 | 6.607329e+08 | 1.312063e+09 |
| Bruno Mars | 0.6256 | 0.59974 | 1.089786e+09 | 1.024092e+09 |
| Ed Sheeran | 0.6089 | 0.56600 | 1.439488e+09 | 1.546021e+09 |
| Justin Bieber | 0.6003 | 0.56130 | 1.209777e+09 | 1.099106e+09 |
| BLACKPINK | 0.7681 | 0.53370 | 4.015972e+08 | 9.392962e+08 |
| Eminem | 0.7826 | 0.51860 | 9.748588e+08 | 9.119324e+08 |
| Macklemore & Ryan Lewis | 0.6699 | 0.50833 | 3.982978e+08 | 1.012206e+09 |
| Imagine Dragons | 0.7174 | 0.45197 | 1.185831e+09 | 9.093785e+08 |
| DJ Snake | 0.7901 | 0.30885 | 7.739463e+08 | 9.180268e+08 |
| Coldplay | 0.5704 | 0.29060 | 1.177848e+09 | 9.997278e+08 |
# Bar chart of mean Valence for the rows of topValenceTrack, which is built
# by grouping on Artist and keeping the 15 artists with the highest mean Views.
plt.figure(figsize=(25, 10))
ax = sns.barplot(x=topValenceTrack.index, y="Valence", data=topValenceTrack)
for label in ax.containers:
    ax.bar_label(label)
# The bars are 15 artists (not 10 tracks), so the title is corrected to match.
plt.title("Top 15 Artists by views with respect to their Positive Energy (Valence)", fontsize=30)
plt.show()
# Take the ten track titles with the highest mean Views, order them by mean
# Energy (highest first) and draw a labelled bar chart.
topSongEnergy = (
    data[["Track", "Energy", "Valence", "Stream", "Views"]]
    .groupby(["Track"])
    .mean()
    .sort_values(by="Views", ascending=False)
    .head(10)
    .sort_values(by="Energy", ascending=False)
)

plt.figure(figsize=(25, 10))
ax = sns.barplot(data=topSongEnergy, x=topSongEnergy.index, y="Energy")
for container in ax.containers:
    ax.bar_label(container)
ax.set_title("Top 10 Tracks with respect to Energy", fontsize=30)
plt.show()